{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b050edae-48cf-47ba-b2ae-bf9fc9098bb4",
   "metadata": {},
   "source": [
    "## Noise Injection for Text Augmentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0ab61d1f-466f-4469-99da-b172a959c2cb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Author: Sebastian Raschka\n",
      "\n",
      "Python implementation: CPython\n",
      "Python version       : 3.10.6\n",
      "IPython version      : 8.12.0\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the author name plus the Python and IPython versions used for this run.\n",
    "%load_ext watermark\n",
    "%watermark -a 'Sebastian Raschka' -v"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47edf286-4825-4274-94b4-d10423767646",
   "metadata": {},
   "source": [
    "### Random Character Insertion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cc137461-a113-4e62-8bf0-37ea2f5ebdb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import string\n",
    "\n",
    "\n",
    "def random_character_insertion(text, insertion_rate=0.1, characters=string.ascii_letters):\n",
    "    \"\"\"Insert randomly chosen characters at random positions in ``text``.\n",
    "\n",
    "    Args:\n",
    "        text: Input string to augment.\n",
    "        insertion_rate: Fraction of ``len(text)`` that determines how many\n",
    "            characters are inserted. The count is ``int(len(text) * insertion_rate)``,\n",
    "            so short inputs may receive no insertions at all.\n",
    "        characters: Pool the inserted characters are drawn from. Defaults to\n",
    "            ASCII letters, which was previously the hard-coded alphabet.\n",
    "\n",
    "    Returns:\n",
    "        The text with the extra characters inserted.\n",
    "\n",
    "    Raises:\n",
    "        IndexError: If ``characters`` is empty and at least one insertion is due.\n",
    "    \"\"\"\n",
    "    # The count is fixed from the original length, before the text grows.\n",
    "    num_insertions = int(len(text) * insertion_rate)\n",
    "\n",
    "    for _ in range(num_insertions):\n",
    "        # randint is inclusive on both ends, so len(text) means 'append at the end'.\n",
    "        position = random.randint(0, len(text))\n",
    "        character = random.choice(characters)\n",
    "        text = text[:position] + character + text[position:]\n",
    "\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "19443702-7195-4535-937b-d21ccf301bdd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Random Character Insertion: The Kcat jumped over the doZg.\n"
     ]
    }
   ],
   "source": [
    "# Seed the global RNG so the augmented string printed below is reproducible.\n",
    "random.seed(1)\n",
    "\n",
    "\n",
    "# `text` and `augmented_text` are read again by the diff cell that follows.\n",
    "text = \"The cat jumped over the dog.\"\n",
    "augmented_text = random_character_insertion(text)\n",
    "print(\"Random Character Insertion:\", augmented_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "925d11d1-b3dc-4db5-b2dc-0a0228a13384",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  T\n",
      "  h\n",
      "  e\n",
      "   \n",
      "+ K\n",
      "  c\n",
      "  a\n",
      "  t\n",
      "   \n",
      "  j\n",
      "  u\n",
      "  m\n",
      "  p\n",
      "  e\n",
      "  d\n",
      "   \n",
      "  o\n",
      "  v\n",
      "  e\n",
      "  r\n",
      "   \n",
      "  t\n",
      "  h\n",
      "  e\n",
      "   \n",
      "  d\n",
      "  o\n",
      "+ Z\n",
      "  g\n",
      "  .\n"
     ]
    }
   ],
   "source": [
    "import difflib\n",
    "\n",
    "\n",
    "# Strings are compared as sequences of single characters, so every\n",
    "# inserted character appears on its own line with a '+ ' prefix.\n",
    "d = difflib.Differ()\n",
    "diff = d.compare(text, augmented_text)\n",
    "\n",
    "print(*diff, sep='\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "040eabc9-2583-4547-bafa-b5883382486b",
   "metadata": {},
   "source": [
    "### Random Character Deletion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "4f927974-c05c-4a70-b813-6423503b3711",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "def random_character_deletion(text, deletion_rate=0.1):\n",
    "    \"\"\"Remove randomly chosen characters from ``text``.\n",
    "\n",
    "    The number of removals is ``int(len(text) * deletion_rate)``, computed once\n",
    "    from the original length. Each removal draws a new position from the\n",
    "    current (already shortened) text using the global ``random`` state.\n",
    "\n",
    "    Args:\n",
    "        text: Input string to augment.\n",
    "        deletion_rate: Fraction of the original length to delete.\n",
    "\n",
    "    Returns:\n",
    "        The text with the selected characters removed.\n",
    "    \"\"\"\n",
    "    deletions_left = int(len(text) * deletion_rate)\n",
    "\n",
    "    # Stop early once nothing is left to delete.\n",
    "    while deletions_left > 0 and text:\n",
    "        # randrange(n) draws from 0..n-1, i.e. every valid index.\n",
    "        index = random.randrange(len(text))\n",
    "        text = text[:index] + text[index + 1:]\n",
    "        deletions_left -= 1\n",
    "\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "44409283-de8c-4a93-b27c-205c929823f6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Random Character Deletion: The at jumped overthe dog.\n"
     ]
    }
   ],
   "source": [
    "# Seed the global RNG so the augmented string printed below is reproducible.\n",
    "random.seed(1)\n",
    "\n",
    "\n",
    "# `text` and `augmented_text` are read again by the diff cell that follows.\n",
    "text = \"The cat jumped over the dog.\"\n",
    "augmented_text = random_character_deletion(text)\n",
    "print(\"Random Character Deletion:\", augmented_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "a635453c-c1c8-4494-9c1a-d40fbaebb9fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  T\n",
      "  h\n",
      "  e\n",
      "   \n",
      "- c\n",
      "  a\n",
      "  t\n",
      "   \n",
      "  j\n",
      "  u\n",
      "  m\n",
      "  p\n",
      "  e\n",
      "  d\n",
      "   \n",
      "  o\n",
      "  v\n",
      "  e\n",
      "  r\n",
      "-  \n",
      "  t\n",
      "  h\n",
      "  e\n",
      "   \n",
      "  d\n",
      "  o\n",
      "  g\n",
      "  .\n"
     ]
    }
   ],
   "source": [
    "import difflib\n",
    "\n",
    "\n",
    "# Strings are compared as sequences of single characters, so every\n",
    "# deleted character appears on its own line with a '- ' prefix.\n",
    "d = difflib.Differ()\n",
    "diff = d.compare(text, augmented_text)\n",
    "\n",
    "print(*diff, sep='\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8c82f867-7537-462a-a61a-f9063328d92d",
   "metadata": {},
   "source": [
    "### Typo Introduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "1be9b70a-e07a-4ce1-9e15-5a5457c25f3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "def typo_introduction(text, introduction_rate=0.1):\n",
    "    \"\"\"Simulate typos by swapping randomly chosen pairs of adjacent characters.\n",
    "\n",
    "    The number of swaps is ``int(len(text) * introduction_rate)``. Each swap\n",
    "    draws a new position using the global ``random`` state.\n",
    "\n",
    "    Args:\n",
    "        text: Input string to augment.\n",
    "        introduction_rate: Fraction of ``len(text)`` that sets the swap count.\n",
    "\n",
    "    Returns:\n",
    "        The text after the swaps. Texts shorter than two characters are\n",
    "        returned unchanged.\n",
    "    \"\"\"\n",
    "    swaps_left = int(len(text) * introduction_rate)\n",
    "\n",
    "    # A swap needs two neighbours, so shorter texts are left untouched.\n",
    "    while swaps_left > 0 and len(text) >= 2:\n",
    "        # Left index of the pair; the last character has no right neighbour.\n",
    "        left = random.randrange(len(text) - 1)\n",
    "        right = left + 1\n",
    "        text = text[:left] + text[right] + text[left] + text[right + 1:]\n",
    "        swaps_left -= 1\n",
    "\n",
    "    return text\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "b3b7d05b-348c-42d7-9e34-3aafdbcef894",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Typo Introduction: The act jumped ove rthe dog.\n"
     ]
    }
   ],
   "source": [
    "# Seed the global RNG so the augmented string printed below is reproducible.\n",
    "random.seed(1)\n",
    "\n",
    "\n",
    "# `text` and `augmented_text` are read again by the diff cell that follows.\n",
    "text = \"The cat jumped over the dog.\"\n",
    "augmented_text = typo_introduction(text)\n",
    "print(\"Typo Introduction:\", augmented_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "aaf83ea4-89bc-456b-b834-88bc1a3845cb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  T\n",
      "  h\n",
      "  e\n",
      "   \n",
      "+ a\n",
      "  c\n",
      "- a\n",
      "  t\n",
      "   \n",
      "  j\n",
      "  u\n",
      "  m\n",
      "  p\n",
      "  e\n",
      "  d\n",
      "   \n",
      "  o\n",
      "  v\n",
      "  e\n",
      "+  \n",
      "  r\n",
      "-  \n",
      "  t\n",
      "  h\n",
      "  e\n",
      "   \n",
      "  d\n",
      "  o\n",
      "  g\n",
      "  .\n"
     ]
    }
   ],
   "source": [
    "import difflib\n",
    "\n",
    "\n",
    "# Strings are compared as sequences of single characters, so a swapped\n",
    "# pair appears as one '+ ' line and one '- ' line around the kept character.\n",
    "d = difflib.Differ()\n",
    "diff = d.compare(text, augmented_text)\n",
    "\n",
    "print(*diff, sep='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01410981-b320-40ff-bebb-b77b416045fd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
